VSURF

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(caret)
library(ggRandomForests)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
         across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
         across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
         `Primary Language` = as.factor(`Primary Language`))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Family

rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Family~.,
                          data=rfdata,
                          seed=3)$data


# VSURF(Family~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Family~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Family ~ ., rfdata, na.action = "na.omit", parallel = T, : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()

 VSURF computation time: 11.6 secs 

 VSURF selected: 
    18 variables at thresholding step (in 6.2 secs)
    3 variables at interpretation step (in 3.6 secs)
    3 variables at prediction step (in 1.7 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"                  "Ethnicity"            "English.Difficulties"
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"                  "Ethnicity"            "English.Difficulties"
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.4059747

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.02655       0.00070 black
2                       Ethnicity    0.01326       0.00093   red
3            English.Difficulties    0.00791       0.00061 black
4           Duration.of.Residency    0.00531       0.00071 black
5                         US.Born    0.00429       0.00041 black
6                English.Speaking    0.00305       0.00054 black
7            Full.Time.Employment    0.00286       0.00045 black
8                          Gender    0.00269       0.00052 black
9                Primary.Language    0.00262       0.00039 black
10                      Belonging    0.00240       0.00071 black
11            Identify.Ethnically    0.00239       0.00052 black
12                 Discrimination    0.00230       0.00046 black
13               Dental.Insurance    0.00208       0.00031 black
14                       Religion    0.00189       0.00069 black
15                  Income_median    0.00188       0.00032 black
16               Health.Insurance    0.00129       0.00018 black
17 Familiarity.with.Ethnic.Origin    0.00092       0.00046 black
18       Familiarity.with.America    0.00068       0.00048 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_family.png", width=8, height=12,units="in")

Logistic regression (Interpretation)

lr <- rfdata |> select(Family,names(rfdata[,-1])[vsurf.mod$varselect.pred])

lr_mod <- glm(Family~.,family=binomial,data=lr)
summary(lr_mod)

Call:
glm(formula = Family ~ ., family = binomial, data = lr)

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                    1.039877   0.164542   6.320 2.62e-10 ***
Age                           -0.013484   0.002900  -4.650 3.32e-06 ***
EthnicityAsian Indian         -0.445283   0.138338  -3.219 0.001287 ** 
EthnicityFilipino             -0.279527   0.175999  -1.588 0.112234    
EthnicityKorean               -0.648903   0.142419  -4.556 5.21e-06 ***
EthnicityOther                -0.851280   0.220476  -3.861 0.000113 ***
EthnicityVietnamese           -0.729705   0.142115  -5.135 2.83e-07 ***
English.DifficultiesMuch       0.458596   0.134518   3.409 0.000652 ***
English.DifficultiesNot much  -0.001948   0.122826  -0.016 0.987345    
English.DifficultiesVery much -0.270289   0.133988  -2.017 0.043668 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2735.1  on 1974  degrees of freedom
Residual deviance: 2655.2  on 1965  degrees of freedom
AIC: 2675.2

Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)
Analysis of Deviance Table (Type II tests)

Response: Family
                     LR Chisq Df Pr(>Chisq)    
Age                    21.896  1  2.878e-06 ***
Ethnicity              38.682  5  2.752e-07 ***
English.Difficulties   24.968  3  1.568e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T) |> DT::datatable()

CV LASSO

x <- makeX(rfdata[,-1])
y <- rfdata$Family


cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se
[1] 0.01749015
coef(modcv,s="lambda.1se")
55 x 1 sparse Matrix of class "dgCMatrix"
                                                   s1
(Intercept)                              5.081756e-01
EthnicityChinese                         3.446551e-01
EthnicityAsian Indian                    .           
EthnicityFilipino                        2.000085e-02
EthnicityKorean                          .           
EthnicityOther                          -4.435060e-02
EthnicityVietnamese                     -5.947637e-02
Age                                     -6.976510e-03
GenderFemale                            -8.492122e-02
GenderMale                               6.220380e-15
ReligionNone                             .           
ReligionBuddhist                         .           
ReligionCatholic                         .           
ReligionHindu                            .           
ReligionMuslim                           .           
ReligionOther                            .           
ReligionProtestant                       .           
Full.Time.Employment0                    2.059562e-01
Full.Time.EmploymentEmployed full time  -3.548327e-04
Income_medianBelow                       .           
Income_medianAbove                       .           
US.BornNo                               -4.251606e-01
US.BornYes                               .           
Duration.of.Residency                   -1.110031e-03
Primary.Language0                        .           
Primary.Language1                        .           
English.SpeakingNot at all               .           
English.SpeakingNot well                 .           
English.SpeakingVery well                .           
English.SpeakingWell                     .           
English.DifficultiesNot at all           .           
English.DifficultiesMuch                 3.079440e-01
English.DifficultiesNot much             .           
English.DifficultiesVery much           -9.182944e-03
Familiarity.with.AmericaVery low         .           
Familiarity.with.AmericaLow              .           
Familiarity.with.AmericaHigh             .           
Familiarity.with.AmericaVery high        .           
Familiarity.with.Ethnic.OriginVery low   .           
Familiarity.with.Ethnic.OriginLow        .           
Familiarity.with.Ethnic.OriginHigh       .           
Familiarity.with.Ethnic.OriginVery high  .           
Identify.EthnicallyNot at all           -4.858230e-01
Identify.EthnicallyNot very close        .           
Identify.EthnicallySomewhat close        .           
Identify.EthnicallyVery close            .           
BelongingNot at all                     -1.371431e-04
BelongingNot very much                   .           
BelongingSomewhat                        .           
BelongingVery much                       1.028260e-02
Discrimination                           1.220443e-01
Health.Insurance0                        .           
Health.InsuranceYes                      .           
Dental.Insurance0                        7.624372e-02
Dental.InsuranceYes                     -4.376660e-14
print(modcv)

Call:  cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial") 

Measure: AUC 

      Lambda Index Measure       SE Nonzero
min 0.005727    26  0.6277 0.009836      33
1se 0.017490    14  0.6184 0.010205      19

Health Professionals

rfdata <- qol |> select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Heal.Professionals~.,
                          data=rfdata,
                          seed=3)$data

# VSURF(Heal.Professionals~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Heal.Professionals~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Heal.Professionals ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()

 VSURF computation time: 13.3 secs 

 VSURF selected: 
    15 variables at thresholding step (in 6.4 secs)
    6 variables at interpretation step (in 2.8 secs)
    3 variables at prediction step (in 4.1 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "English.Speaking"      "English.Difficulties" 
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "English.Speaking"      "English.Difficulties" 
[4] "Dental.Insurance"      "Health.Insurance"      "Ethnicity"            
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.3803138

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.01576       0.00120 black
2                English.Speaking    0.01511       0.00108 black
3            English.Difficulties    0.00605       0.00079 black
4                Dental.Insurance    0.00565       0.00035 black
5                Health.Insurance    0.00533       0.00036 black
6                       Ethnicity    0.00510       0.00069   red
7                        Religion    0.00481       0.00070 black
8                             Age    0.00441       0.00059 black
9                   Income_median    0.00311       0.00042 black
10       Familiarity.with.America    0.00287       0.00053 black
11                 Discrimination    0.00245       0.00038 black
12 Familiarity.with.Ethnic.Origin    0.00231       0.00069 black
13            Identify.Ethnically    0.00211       0.00052 black
14               Primary.Language    0.00110       0.00043 black
15                      Belonging    0.00078       0.00050 black
16           Full.Time.Employment    0.00031       0.00039 black
17                        US.Born   -0.00023       0.00019 black
18                         Gender   -0.00030       0.00038 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_hp.png", width=12, height=8,units="in")

Logistic regression (Interpretation)

lr <- rfdata |> select(Heal.Professionals,names(rfdata[,-1])[vsurf.mod$varselect.pred])

lr_mod <- glm(Heal.Professionals~.,family=binomial,data=lr)
summary(lr_mod)

Call:
glm(formula = Heal.Professionals ~ ., family = binomial, data = lr)

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -0.884624   0.251110  -3.523 0.000427 ***
Duration.of.Residency          0.028505   0.004013   7.102 1.23e-12 ***
English.SpeakingNot well       0.227614   0.251086   0.907 0.364662    
English.SpeakingVery well      1.064288   0.242697   4.385 1.16e-05 ***
English.SpeakingWell           0.762795   0.241370   3.160 0.001576 ** 
English.DifficultiesMuch      -0.361097   0.159166  -2.269 0.023288 *  
English.DifficultiesNot much  -0.267057   0.147068  -1.816 0.069389 .  
English.DifficultiesVery much -0.275771   0.139962  -1.970 0.048801 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2736.5  on 1975  degrees of freedom
Residual deviance: 2557.1  on 1968  degrees of freedom
AIC: 2573.1

Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)
Analysis of Deviance Table (Type II tests)

Response: Heal.Professionals
                      LR Chisq Df Pr(>Chisq)    
Duration.of.Residency   52.489  1  4.327e-13 ***
English.Speaking        42.704  3  2.844e-09 ***
English.Difficulties     6.501  3    0.08962 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|> DT::datatable()

CV LASSO

x <- makeX(rfdata[,-1])
y <- rfdata$Heal.Professionals


cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se
[1] 0.0289948
coef(modcv,s="lambda.1se")
55 x 1 sparse Matrix of class "dgCMatrix"
                                                   s1
(Intercept)                             -1.868095e-01
EthnicityChinese                         .           
EthnicityAsian Indian                    .           
EthnicityFilipino                        1.763755e-01
EthnicityKorean                         -7.409358e-02
EthnicityOther                           .           
EthnicityVietnamese                      .           
Age                                      .           
GenderFemale                             .           
GenderMale                               .           
ReligionNone                             .           
ReligionBuddhist                         .           
ReligionCatholic                         .           
ReligionHindu                            .           
ReligionMuslim                           .           
ReligionOther                            .           
ReligionProtestant                       .           
Full.Time.Employment0                    .           
Full.Time.EmploymentEmployed full time   .           
Income_medianBelow                       .           
Income_medianAbove                       .           
US.BornNo                                .           
US.BornYes                               .           
Duration.of.Residency                    1.906691e-02
Primary.Language0                        .           
Primary.Language1                        .           
English.SpeakingNot at all               .           
English.SpeakingNot well                -2.112454e-01
English.SpeakingVery well                1.940132e-01
English.SpeakingWell                     .           
English.DifficultiesNot at all           1.617571e-01
English.DifficultiesMuch                 .           
English.DifficultiesNot much             .           
English.DifficultiesVery much            .           
Familiarity.with.AmericaVery low         .           
Familiarity.with.AmericaLow              .           
Familiarity.with.AmericaHigh             .           
Familiarity.with.AmericaVery high        8.675712e-02
Familiarity.with.Ethnic.OriginVery low  -8.681946e-02
Familiarity.with.Ethnic.OriginLow        .           
Familiarity.with.Ethnic.OriginHigh       .           
Familiarity.with.Ethnic.OriginVery high  .           
Identify.EthnicallyNot at all            .           
Identify.EthnicallyNot very close        .           
Identify.EthnicallySomewhat close        .           
Identify.EthnicallyVery close            .           
BelongingNot at all                      .           
BelongingNot very much                   .           
BelongingSomewhat                        .           
BelongingVery much                       .           
Discrimination                           .           
Health.Insurance0                       -3.599801e-01
Health.InsuranceYes                      1.102124e-13
Dental.Insurance0                       -2.204331e-01
Dental.InsuranceYes                      6.410888e-14
print(modcv)

Call:  cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial") 

Measure: AUC 

     Lambda Index Measure       SE Nonzero
min 0.01042    26  0.6903 0.006749      23
1se 0.02899    15  0.6846 0.007234      12

Physical Check-up

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Physical.Check.up~.,
                          data=rfdata,
                          seed=3)$data

# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()

 VSURF computation time: 11.4 secs 

 VSURF selected: 
    17 variables at thresholding step (in 5.7 secs)
    5 variables at interpretation step (in 3.4 secs)
    2 variables at prediction step (in 2.4 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Health.Insurance"     
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Age"                   "Health.Insurance"     
[4] "Dental.Insurance"      "Income_median"        
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.2829858

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.02969       0.00074 black
2                             Age    0.02669       0.00075 black
3                Health.Insurance    0.02241       0.00063 black
4                Dental.Insurance    0.01358       0.00074 black
5                   Income_median    0.00725       0.00045 black
6                       Ethnicity    0.00676       0.00073   red
7                    EnglishSpeak    0.00532       0.00052 black
8                      Employment    0.00473       0.00036 black
9                     EnglishDiff    0.00439       0.00050 black
10                       Religion    0.00421       0.00052 black
11       Familiarity.with.America    0.00244       0.00049 black
12                         Gender    0.00233       0.00044 black
13 Familiarity.with.Ethnic.Origin    0.00231       0.00038 black
14               Primary.Language    0.00107       0.00030 black
15                        US.Born    0.00096       0.00014 black
16                 Discrimination    0.00061       0.00027 black
17            Identify.Ethnically    0.00053       0.00037 black
18                      Belonging    0.00016       0.00038 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_PC.png", width=12, height=8,units="in")

Logistic regression (Interpretation)

lr <- rfdata |> select(Physical.Check.up,names(rfdata[,-1])[vsurf.mod$varselect.interp])

lr_mod <- glm(Physical.Check.up~.,family=binomial,data=lr)
summary(lr_mod)

Call:
glm(formula = Physical.Check.up ~ ., family = binomial, data = lr)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -1.928578   0.205890  -9.367  < 2e-16 ***
Duration.of.Residency  0.033459   0.005215   6.415 1.40e-10 ***
Age                    0.017649   0.003775   4.675 2.94e-06 ***
Health.InsuranceYes    1.169682   0.159911   7.315 2.58e-13 ***
Dental.InsuranceYes    0.647069   0.123481   5.240 1.60e-07 ***
Income_medianAbove     0.270055   0.114061   2.368   0.0179 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2466.2  on 1965  degrees of freedom
Residual deviance: 2152.4  on 1960  degrees of freedom
AIC: 2164.4

Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)
Analysis of Deviance Table (Type II tests)

Response: Physical.Check.up
                      LR Chisq Df Pr(>Chisq)    
Duration.of.Residency   43.282  1  4.738e-11 ***
Age                     22.227  1  2.423e-06 ***
Health.Insurance        55.729  1  8.318e-14 ***
Dental.Insurance        27.267  1  1.772e-07 ***
Income_median            5.582  1    0.01815 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|> 
  mutate(across(estimate:conf.high,~round(.x,3))) |> 
  DT::datatable()

CV LASSO

x <- makeX(rfdata[,-1])
y <- rfdata$Physical.Check.up


cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se
[1] 0.03886821
coef(modcv,s="lambda.1se")
55 x 1 sparse Matrix of class "dgCMatrix"
                                                   s1
(Intercept)                              4.458627e-01
EthnicityChinese                         .           
EthnicityAsian Indian                    .           
EthnicityFilipino                        .           
EthnicityKorean                         -1.261200e-01
EthnicityOther                           .           
EthnicityVietnamese                      .           
Age                                      5.442383e-03
GenderFemale                             1.135173e-01
GenderMale                               .           
ReligionNone                             .           
ReligionBuddhist                         .           
ReligionCatholic                         .           
ReligionHindu                            .           
ReligionMuslim                           .           
ReligionOther                            .           
ReligionProtestant                       .           
Employment0                              .           
EmploymentEmployed full time             .           
Income_medianBelow                      -8.947533e-03
Income_medianAbove                       4.649018e-15
US.BornNo                                .           
US.BornYes                               .           
Duration.of.Residency                    2.328732e-02
Primary.Language0                        .           
Primary.Language1                        .           
EnglishSpeakNot at all                   .           
EnglishSpeakNot well                     .           
EnglishSpeakVery well                    .           
EnglishSpeakWell                         .           
EnglishDiffNot at all                    .           
EnglishDiffMuch                          .           
EnglishDiffNot much                      .           
EnglishDiffVery much                     .           
Familiarity.with.AmericaVery low         .           
Familiarity.with.AmericaLow              .           
Familiarity.with.AmericaHigh             .           
Familiarity.with.AmericaVery high        .           
Familiarity.with.Ethnic.OriginVery low   .           
Familiarity.with.Ethnic.OriginLow        .           
Familiarity.with.Ethnic.OriginHigh       .           
Familiarity.with.Ethnic.OriginVery high  .           
Identify.EthnicallyNot at all            .           
Identify.EthnicallyNot very close        .           
Identify.EthnicallySomewhat close        .           
Identify.EthnicallyVery close            .           
BelongingNot at all                      .           
BelongingNot very much                   .           
BelongingSomewhat                        .           
BelongingVery much                       .           
Discrimination                           .           
Health.Insurance0                       -8.297677e-01
Health.InsuranceYes                      .           
Dental.Insurance0                       -4.033314e-01
Dental.InsuranceYes                      .           
print(modcv)

Call:  cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial") 

Measure: AUC 

     Lambda Index Measure      SE Nonzero
min 0.00877    30  0.7559 0.01248      28
1se 0.03887    14  0.7437 0.01008       8

Boruta

set.seed(123)

Boruta(Physical.Check.up~.,
       data=rfdata,
       doTrace=2,
       ) ->pc_boruta
 1. run of importance source...
 2. run of importance source...
 3. run of importance source...
 4. run of importance source...
 5. run of importance source...
 6. run of importance source...
 7. run of importance source...
 8. run of importance source...
 9. run of importance source...
 10. run of importance source...
 11. run of importance source...
After 11 iterations, +2.7 secs: 
 confirmed 10 attributes: Age, Dental.Insurance, Duration.of.Residency, Employment, EnglishDiff and 5 more;
 still have 8 attributes left.
 12. run of importance source...
 13. run of importance source...
 14. run of importance source...
 15. run of importance source...
After 15 iterations, +3.6 secs: 
 rejected 2 attributes: Discrimination, Identify.Ethnically;
 still have 6 attributes left.
 16. run of importance source...
 17. run of importance source...
 18. run of importance source...
 19. run of importance source...
After 19 iterations, +4.4 secs: 
 confirmed 1 attribute: Familiarity.with.America;
 still have 5 attributes left.
 20. run of importance source...
 21. run of importance source...
 22. run of importance source...
 23. run of importance source...
 24. run of importance source...
 25. run of importance source...
 26. run of importance source...
 27. run of importance source...
 28. run of importance source...
 29. run of importance source...
 30. run of importance source...
 31. run of importance source...
 32. run of importance source...
 33. run of importance source...
 34. run of importance source...
After 34 iterations, +7.4 secs: 
 rejected 1 attribute: Belonging;
 still have 4 attributes left.
 35. run of importance source...
 36. run of importance source...
 37. run of importance source...
 38. run of importance source...
 39. run of importance source...
 40. run of importance source...
 41. run of importance source...
 42. run of importance source...
 43. run of importance source...
 44. run of importance source...
 45. run of importance source...
 46. run of importance source...
 47. run of importance source...
After 47 iterations, +9.9 secs: 
 confirmed 1 attribute: Primary.Language;
 still have 3 attributes left.
 48. run of importance source...
 49. run of importance source...
 50. run of importance source...
 51. run of importance source...
 52. run of importance source...
 53. run of importance source...
 54. run of importance source...
 55. run of importance source...
 56. run of importance source...
 57. run of importance source...
 58. run of importance source...
 59. run of importance source...
 60. run of importance source...
 61. run of importance source...
 62. run of importance source...
 63. run of importance source...
 64. run of importance source...
 65. run of importance source...
 66. run of importance source...
 67. run of importance source...
 68. run of importance source...
 69. run of importance source...
 70. run of importance source...
 71. run of importance source...
 72. run of importance source...
 73. run of importance source...
After 73 iterations, +15 secs: 
 confirmed 1 attribute: Religion;
 still have 2 attributes left.
 74. run of importance source...
 75. run of importance source...
 76. run of importance source...
 77. run of importance source...
 78. run of importance source...
 79. run of importance source...
 80. run of importance source...
 81. run of importance source...
 82. run of importance source...
 83. run of importance source...
 84. run of importance source...
 85. run of importance source...
 86. run of importance source...
 87. run of importance source...
 88. run of importance source...
 89. run of importance source...
 90. run of importance source...
 91. run of importance source...
 92. run of importance source...
After 92 iterations, +18 secs: 
 confirmed 1 attribute: Familiarity.with.Ethnic.Origin;
 still have 1 attribute left.
 93. run of importance source...
 94. run of importance source...
 95. run of importance source...
 96. run of importance source...
 97. run of importance source...
 98. run of importance source...
 99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)

pc_boruta
Boruta performed 99 iterations in 19.78841 secs.
 14 attributes confirmed important: Age, Dental.Insurance,
Duration.of.Residency, Employment, EnglishDiff and 9 more;
 3 attributes confirmed unimportant: Belonging, Discrimination,
Identify.Ethnically;
 1 tentative attributes left: US.Born;
pc_boruta$finalDecision
                     Ethnicity                            Age 
                     Confirmed                      Confirmed 
                        Gender                       Religion 
                     Confirmed                      Confirmed 
                    Employment                  Income_median 
                     Confirmed                      Confirmed 
                       US.Born          Duration.of.Residency 
                     Tentative                      Confirmed 
              Primary.Language                   EnglishSpeak 
                     Confirmed                      Confirmed 
                   EnglishDiff       Familiarity.with.America 
                     Confirmed                      Confirmed 
Familiarity.with.Ethnic.Origin            Identify.Ethnically 
                     Confirmed                       Rejected 
                     Belonging                 Discrimination 
                      Rejected                       Rejected 
              Health.Insurance               Dental.Insurance 
                     Confirmed                      Confirmed 
Levels: Tentative Confirmed Rejected

Naive Bayes Classifier

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)


cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Physical.Check.up ~ ., data = rfdata, 
                 method = "naive_bayes", 
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)

stopCluster(cl)
varImp(rf_fit)
ROC curve variable importance

                               Importance
Duration.of.Residency             100.000
Dental.Insurance                   75.670
Age                                59.640
Health.Insurance                   58.546
Income_median                      48.264
EnglishDiff                        36.790
Familiarity.with.America           32.617
Gender                             31.000
Belonging                          27.681
Primary.Language                   25.326
Identify.Ethnically                25.209
EnglishSpeak                       25.023
Employment                         23.596
Familiarity.with.Ethnic.Origin     21.398
Discrimination                     17.250
Ethnicity                           7.396
Religion                            5.464
US.Born                             0.000

Dental Check-up

rfdata <- qol |> select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Dentist.Check.up~.,
                          data=rfdata,
                          seed=3)$data

# VSURF(Dentist.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()

 VSURF computation time: 10.6 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.8 secs)
    3 variables at interpretation step (in 3.5 secs)
    3 variables at prediction step (in 1.4 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.2955125

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.05183       0.00065 black
2           Duration.of.Residency    0.04928       0.00068 black
3                        Religion    0.01610       0.00076 black
4                       Ethnicity    0.01342       0.00088   red
5                             Age    0.01150       0.00060 black
6                Health.Insurance    0.00817       0.00042 black
7                English.Speaking    0.00766       0.00045 black
8                   Income_median    0.00676       0.00050 black
9             Identify.Ethnically    0.00378       0.00032 black
10           English.Difficulties    0.00325       0.00042 black
11       Familiarity.with.America    0.00286       0.00042 black
12 Familiarity.with.Ethnic.Origin    0.00282       0.00047 black
13                      Belonging    0.00243       0.00046 black
14           Full.Time.Employment    0.00215       0.00035 black
15                         Gender    0.00199       0.00035 black
16                        US.Born    0.00178       0.00019 black
17               Primary.Language    0.00178       0.00028 black
18                 Discrimination    0.00079       0.00025 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_Dc.png", width=12, height=8,units="in")

Logistic regression (Interpretation)

lr <- rfdata |> select(Dentist.Check.up,names(rfdata[,-1])[vsurf.mod$varselect.interp])

lr_mod <- glm(Dentist.Check.up~.,family=binomial,data=lr)
summary(lr_mod)

Call:
glm(formula = Dentist.Check.up ~ ., family = binomial, data = lr)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -1.120303   0.142888  -7.840 4.49e-15 ***
Dental.InsuranceYes    1.631483   0.107046  15.241  < 2e-16 ***
Duration.of.Residency  0.045181   0.004657   9.701  < 2e-16 ***
ReligionBuddhist       0.139163   0.189583   0.734 0.462920    
ReligionCatholic       0.012325   0.170281   0.072 0.942301    
ReligionHindu         -0.654930   0.168546  -3.886 0.000102 ***
ReligionMuslim        -0.538116   0.338322  -1.591 0.111712    
ReligionOther         -0.678784   0.408196  -1.663 0.096335 .  
ReligionProtestant    -0.122292   0.156091  -0.783 0.433356    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2659.6  on 1960  degrees of freedom
Residual deviance: 2209.7  on 1952  degrees of freedom
AIC: 2227.7

Number of Fisher Scoring iterations: 4
car::Anova(lr_mod)
Analysis of Deviance Table (Type II tests)

Response: Dentist.Check.up
                      LR Chisq Df Pr(>Chisq)    
Dental.Insurance       251.011  1  < 2.2e-16 ***
Duration.of.Residency  102.856  1  < 2.2e-16 ***
Religion                27.422  6  0.0001207 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|>
  mutate(across(estimate:conf.high,~round(.x,3))) |> 
  DT::datatable()

CV LASSO

x <- makeX(rfdata[,-1])
y <- rfdata$Dentist.Check.up


cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se
[1] 0.1271423
coef(modcv,s="lambda.1se")
55 x 1 sparse Matrix of class "dgCMatrix"
                                                   s1
(Intercept)                              4.898908e-01
EthnicityChinese                         .           
EthnicityAsian Indian                    .           
EthnicityFilipino                        .           
EthnicityKorean                          .           
EthnicityOther                           .           
EthnicityVietnamese                      .           
Age                                      .           
GenderFemale                             .           
GenderMale                               .           
ReligionNone                             .           
ReligionBuddhist                         .           
ReligionCatholic                         .           
ReligionHindu                            .           
ReligionMuslim                           .           
ReligionOther                            .           
ReligionProtestant                       .           
Full.Time.Employment0                    .           
Full.Time.EmploymentEmployed full time   .           
Income_medianBelow                       .           
Income_medianAbove                       .           
US.BornNo                                .           
US.BornYes                               .           
Duration.of.Residency                    3.132535e-03
Primary.Language0                        .           
Primary.Language1                        .           
English.SpeakingNot at all               .           
English.SpeakingNot well                 .           
English.SpeakingVery well                .           
English.SpeakingWell                     .           
English.DifficultiesNot at all           .           
English.DifficultiesMuch                 .           
English.DifficultiesNot much             .           
English.DifficultiesVery much            .           
Familiarity.with.AmericaVery low         .           
Familiarity.with.AmericaLow              .           
Familiarity.with.AmericaHigh             .           
Familiarity.with.AmericaVery high        .           
Familiarity.with.Ethnic.OriginVery low   .           
Familiarity.with.Ethnic.OriginLow        .           
Familiarity.with.Ethnic.OriginHigh       .           
Familiarity.with.Ethnic.OriginVery high  .           
Identify.EthnicallyNot at all            .           
Identify.EthnicallyNot very close        .           
Identify.EthnicallySomewhat close        .           
Identify.EthnicallyVery close            .           
BelongingNot at all                      .           
BelongingNot very much                   .           
BelongingSomewhat                        .           
BelongingVery much                       .           
Discrimination                           .           
Health.Insurance0                        .           
Health.InsuranceYes                      .           
Dental.Insurance0                       -4.719636e-01
Dental.InsuranceYes                      1.567321e-13
print(modcv)

Call:  cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial") 

Measure: AUC 

     Lambda Index Measure      SE Nonzero
min 0.01363    29  0.7788 0.01055      20
1se 0.12714     5  0.7682 0.01208       3

Boruta

set.seed(123)

Boruta(Dentist.Check.up~.,
       data=rfdata,
       doTrace=2,
       ) ->pc_boruta
 1. run of importance source...
 2. run of importance source...
 3. run of importance source...
 4. run of importance source...
 5. run of importance source...
 6. run of importance source...
 7. run of importance source...
 8. run of importance source...
 9. run of importance source...
 10. run of importance source...
 11. run of importance source...
After 11 iterations, +2.4 secs: 
 confirmed 11 attributes: Age, Dental.Insurance, Duration.of.Residency, English.Speaking, Ethnicity and 6 more;
 still have 7 attributes left.
 12. run of importance source...
 13. run of importance source...
 14. run of importance source...
 15. run of importance source...
After 15 iterations, +3.3 secs: 
 confirmed 2 attributes: English.Difficulties, Primary.Language;
 still have 5 attributes left.
 16. run of importance source...
 17. run of importance source...
 18. run of importance source...
 19. run of importance source...
 20. run of importance source...
 21. run of importance source...
 22. run of importance source...
 23. run of importance source...
 24. run of importance source...
 25. run of importance source...
 26. run of importance source...
 27. run of importance source...
 28. run of importance source...
 29. run of importance source...
 30. run of importance source...
 31. run of importance source...
 32. run of importance source...
 33. run of importance source...
 34. run of importance source...
 35. run of importance source...
 36. run of importance source...
 37. run of importance source...
 38. run of importance source...
 39. run of importance source...
After 39 iterations, +8.6 secs: 
 rejected 1 attribute: Discrimination;
 still have 4 attributes left.
 40. run of importance source...
 41. run of importance source...
 42. run of importance source...
 43. run of importance source...
 44. run of importance source...
 45. run of importance source...
 46. run of importance source...
 47. run of importance source...
 48. run of importance source...
 49. run of importance source...
 50. run of importance source...
 51. run of importance source...
 52. run of importance source...
 53. run of importance source...
After 53 iterations, +11 secs: 
 rejected 1 attribute: Belonging;
 still have 3 attributes left.
 54. run of importance source...
 55. run of importance source...
 56. run of importance source...
 57. run of importance source...
 58. run of importance source...
 59. run of importance source...
 60. run of importance source...
 61. run of importance source...
 62. run of importance source...
 63. run of importance source...
 64. run of importance source...
 65. run of importance source...
 66. run of importance source...
 67. run of importance source...
 68. run of importance source...
 69. run of importance source...
 70. run of importance source...
 71. run of importance source...
 72. run of importance source...
 73. run of importance source...
 74. run of importance source...
 75. run of importance source...
After 75 iterations, +16 secs: 
 confirmed 1 attribute: Full.Time.Employment;
 rejected 1 attribute: Familiarity.with.Ethnic.Origin;
 still have 1 attribute left.
 76. run of importance source...
 77. run of importance source...
 78. run of importance source...
 79. run of importance source...
 80. run of importance source...
 81. run of importance source...
 82. run of importance source...
 83. run of importance source...
 84. run of importance source...
 85. run of importance source...
 86. run of importance source...
 87. run of importance source...
 88. run of importance source...
 89. run of importance source...
 90. run of importance source...
 91. run of importance source...
 92. run of importance source...
 93. run of importance source...
 94. run of importance source...
 95. run of importance source...
 96. run of importance source...
 97. run of importance source...
 98. run of importance source...
 99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)

pc_boruta
Boruta performed 99 iterations in 20.35846 secs.
 14 attributes confirmed important: Age, Dental.Insurance,
Duration.of.Residency, English.Difficulties, English.Speaking and 9
more;
 3 attributes confirmed unimportant: Belonging, Discrimination,
Familiarity.with.Ethnic.Origin;
 1 tentative attributes left: Identify.Ethnically;
pc_boruta$finalDecision
                     Ethnicity                            Age 
                     Confirmed                      Confirmed 
                        Gender                       Religion 
                     Confirmed                      Confirmed 
          Full.Time.Employment                  Income_median 
                     Confirmed                      Confirmed 
                       US.Born          Duration.of.Residency 
                     Confirmed                      Confirmed 
              Primary.Language               English.Speaking 
                     Confirmed                      Confirmed 
          English.Difficulties       Familiarity.with.America 
                     Confirmed                      Confirmed 
Familiarity.with.Ethnic.Origin            Identify.Ethnically 
                      Rejected                      Tentative 
                     Belonging                 Discrimination 
                      Rejected                       Rejected 
              Health.Insurance               Dental.Insurance 
                     Confirmed                      Confirmed 
Levels: Tentative Confirmed Rejected

Naive Bayes Classifier

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)


cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Dentist.Check.up ~ ., data = rfdata, 
                 method = "naive_bayes", 
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)

stopCluster(cl)
varImp(rf_fit)
ROC curve variable importance

                               Importance
Duration.of.Residency             100.000
Dental.Insurance                   96.639
Income_median                      53.830
Familiarity.with.America           49.128
Health.Insurance                   38.691
English.Difficulties               37.429
Primary.Language                   30.509
Age                                30.053
Gender                             26.747
Full.Time.Employment               21.299
English.Speaking                   19.358
Discrimination                     14.236
Identify.Ethnically                14.044
Religion                           13.809
Belonging                          12.067
US.Born                            11.577
Familiarity.with.Ethnic.Origin      4.759
Ethnicity                           0.000

Folkmedicine

rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Folkmedicine~.,
                          data=rfdata,
                          seed=3)$data

# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
VSURF(Folkmedicine~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()

 VSURF computation time: 9.8 secs 

 VSURF selected: 
    18 variables at thresholding step (in 4.8 secs)
    3 variables at interpretation step (in 3.2 secs)
    2 variables at prediction step (in 1.8 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"       "Ethnicity"
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"                   "Duration.of.Residency" "Ethnicity"            
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.1380329

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.01728       0.00085 black
2           Duration.of.Residency    0.01354       0.00073 black
3                       Ethnicity    0.01305       0.00050   red
4                English.Speaking    0.00905       0.00062 black
5                        Religion    0.00703       0.00051 black
6                   Income_median    0.00559       0.00041 black
7            English.Difficulties    0.00472       0.00053 black
8                Primary.Language    0.00262       0.00043 black
9  Familiarity.with.Ethnic.Origin    0.00259       0.00022 black
10       Familiarity.with.America    0.00257       0.00040 black
11           Full.Time.Employment    0.00246       0.00032 black
12                 Discrimination    0.00126       0.00040 black
13               Dental.Insurance    0.00122       0.00024 black
14               Health.Insurance    0.00079       0.00015 black
15                        US.Born    0.00073       0.00023 black
16            Identify.Ethnically    0.00073       0.00034 black
17                         Gender    0.00059       0.00020 black
18                      Belonging    0.00040       0.00033 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_Alt.png", width=12, height=8,units="in")

Logistic regression (Interpretation)

lr <- rfdata |> select(Folkmedicine,names(rfdata[,-1])[vsurf.mod$varselect.interp])

lr_mod <- glm(Folkmedicine~.,family=binomial,data=lr)
summary(lr_mod)

Call:
glm(formula = Folkmedicine ~ ., family = binomial, data = lr)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.612602   0.219844 -11.884  < 2e-16 ***
Age                    0.022611   0.004636   4.877 1.08e-06 ***
Duration.of.Residency  0.007417   0.005960   1.244 0.213371    
EthnicityAsian Indian -0.809613   0.218076  -3.713 0.000205 ***
EthnicityFilipino     -0.742468   0.272122  -2.728 0.006364 ** 
EthnicityKorean        0.234543   0.173006   1.356 0.175197    
EthnicityOther        -0.735426   0.357019  -2.060 0.039408 *  
EthnicityVietnamese   -1.084595   0.232545  -4.664 3.10e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1552.9  on 1946  degrees of freedom
Residual deviance: 1450.9  on 1939  degrees of freedom
AIC: 1466.9

Number of Fisher Scoring iterations: 5
car::Anova(lr_mod)
Analysis of Deviance Table (Type II tests)

Response: Folkmedicine
                      LR Chisq Df Pr(>Chisq)    
Age                     23.485  1  1.259e-06 ***
Duration.of.Residency    1.544  1     0.2141    
Ethnicity               56.750  5  5.693e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::tidy(lr_mod,exponentiate=T,conf.int=T)|>
  mutate(across(estimate:conf.high,~round(.x,3))) |> 
  DT::datatable()

CV LASSO

x <- makeX(rfdata[,-1])
y <- rfdata$Folkmedicine


cv.glmnet(x,y,family="binomial", type.measure = "auc")-> modcv
modcv$lambda.1se
[1] 0.02076259
coef(modcv,s="lambda.1se")
55 x 1 sparse Matrix of class "dgCMatrix"
                                                   s1
(Intercept)                             -2.480267e+00
EthnicityChinese                         1.445335e-01
EthnicityAsian Indian                    .           
EthnicityFilipino                        .           
EthnicityKorean                          3.345835e-01
EthnicityOther                           .           
EthnicityVietnamese                     -8.517757e-02
Age                                      1.238592e-02
GenderFemale                             .           
GenderMale                               .           
ReligionNone                             .           
ReligionBuddhist                         .           
ReligionCatholic                         .           
ReligionHindu                            .           
ReligionMuslim                           .           
ReligionOther                            .           
ReligionProtestant                       1.573109e-01
Full.Time.Employment0                    2.915342e-02
Full.Time.EmploymentEmployed full time  -7.843686e-16
Income_medianBelow                       .           
Income_medianAbove                       .           
US.BornNo                                .           
US.BornYes                               .           
Duration.of.Residency                    1.609651e-03
Primary.Language0                        .           
Primary.Language1                        .           
English.SpeakingNot at all               .           
English.SpeakingNot well                 .           
English.SpeakingVery well               -2.241479e-01
English.SpeakingWell                     .           
English.DifficultiesNot at all           .           
English.DifficultiesMuch                 .           
English.DifficultiesNot much             .           
English.DifficultiesVery much            .           
Familiarity.with.AmericaVery low         .           
Familiarity.with.AmericaLow              .           
Familiarity.with.AmericaHigh             .           
Familiarity.with.AmericaVery high        .           
Familiarity.with.Ethnic.OriginVery low   .           
Familiarity.with.Ethnic.OriginLow        .           
Familiarity.with.Ethnic.OriginHigh       .           
Familiarity.with.Ethnic.OriginVery high  .           
Identify.EthnicallyNot at all            .           
Identify.EthnicallyNot very close        .           
Identify.EthnicallySomewhat close        .           
Identify.EthnicallyVery close            .           
BelongingNot at all                      .           
BelongingNot very much                   .           
BelongingSomewhat                        .           
BelongingVery much                       .           
Discrimination                           .           
Health.Insurance0                        .           
Health.InsuranceYes                      .           
Dental.Insurance0                        .           
Dental.InsuranceYes                      .           
print(modcv)

Call:  cv.glmnet(x = x, y = y, type.measure = "auc", family = "binomial") 

Measure: AUC 

      Lambda Index Measure      SE Nonzero
min 0.002943    32  0.6842 0.01542      33
1se 0.020763    11  0.6694 0.02331       9

Boruta

set.seed(123)

Boruta(Folkmedicine~.,
       data=rfdata,
       doTrace=2,
       ) ->pc_boruta
 1. run of importance source...
 2. run of importance source...
 3. run of importance source...
 4. run of importance source...
 5. run of importance source...
 6. run of importance source...
 7. run of importance source...
 8. run of importance source...
 9. run of importance source...
 10. run of importance source...
 11. run of importance source...
After 11 iterations, +2.1 secs: 
 confirmed 7 attributes: Age, Duration.of.Residency, English.Speaking, Ethnicity, Familiarity.with.America and 2 more;
 still have 11 attributes left.
 12. run of importance source...
 13. run of importance source...
 14. run of importance source...
 15. run of importance source...
After 15 iterations, +2.8 secs: 
 confirmed 2 attributes: Income_median, Religion;
 still have 9 attributes left.
 16. run of importance source...
 17. run of importance source...
 18. run of importance source...
 19. run of importance source...
After 19 iterations, +3.6 secs: 
 confirmed 1 attribute: English.Difficulties;
 rejected 1 attribute: Belonging;
 still have 7 attributes left.
 20. run of importance source...
 21. run of importance source...
 22. run of importance source...
After 22 iterations, +4.1 secs: 
 rejected 1 attribute: Gender;
 still have 6 attributes left.
 23. run of importance source...
 24. run of importance source...
 25. run of importance source...
After 25 iterations, +4.6 secs: 
 confirmed 1 attribute: Familiarity.with.Ethnic.Origin;
 still have 5 attributes left.
 26. run of importance source...
 27. run of importance source...
 28. run of importance source...
After 28 iterations, +5.1 secs: 
 confirmed 2 attributes: Dental.Insurance, Discrimination;
 still have 3 attributes left.
 29. run of importance source...
 30. run of importance source...
 31. run of importance source...
 32. run of importance source...
 33. run of importance source...
 34. run of importance source...
 35. run of importance source...
 36. run of importance source...
 37. run of importance source...
After 37 iterations, +6.6 secs: 
 confirmed 1 attribute: Health.Insurance;
 still have 2 attributes left.
 38. run of importance source...
 39. run of importance source...
 40. run of importance source...
 41. run of importance source...
 42. run of importance source...
 43. run of importance source...
 44. run of importance source...
 45. run of importance source...
 46. run of importance source...
 47. run of importance source...
 48. run of importance source...
 49. run of importance source...
 50. run of importance source...
 51. run of importance source...
 52. run of importance source...
 53. run of importance source...
 54. run of importance source...
 55. run of importance source...
 56. run of importance source...
 57. run of importance source...
 58. run of importance source...
 59. run of importance source...
 60. run of importance source...
 61. run of importance source...
 62. run of importance source...
 63. run of importance source...
 64. run of importance source...
 65. run of importance source...
 66. run of importance source...
 67. run of importance source...
 68. run of importance source...
 69. run of importance source...
 70. run of importance source...
 71. run of importance source...
 72. run of importance source...
 73. run of importance source...
 74. run of importance source...
 75. run of importance source...
 76. run of importance source...
 77. run of importance source...
 78. run of importance source...
 79. run of importance source...
 80. run of importance source...
 81. run of importance source...
 82. run of importance source...
 83. run of importance source...
After 83 iterations, +14 secs: 
 rejected 1 attribute: Identify.Ethnically;
 still have 1 attribute left.
 84. run of importance source...
 85. run of importance source...
 86. run of importance source...
 87. run of importance source...
 88. run of importance source...
 89. run of importance source...
 90. run of importance source...
 91. run of importance source...
 92. run of importance source...
 93. run of importance source...
 94. run of importance source...
 95. run of importance source...
 96. run of importance source...
 97. run of importance source...
 98. run of importance source...
 99. run of importance source...
plot(pc_boruta,las=2,cex.axis=0.5)

pc_boruta
Boruta performed 99 iterations in 16.96238 secs.
 14 attributes confirmed important: Age, Dental.Insurance,
Discrimination, Duration.of.Residency, English.Difficulties and 9 more;
 3 attributes confirmed unimportant: Belonging, Gender,
Identify.Ethnically;
 1 tentative attributes left: US.Born;
pc_boruta$finalDecision
                     Ethnicity                            Age 
                     Confirmed                      Confirmed 
                        Gender                       Religion 
                      Rejected                      Confirmed 
          Full.Time.Employment                  Income_median 
                     Confirmed                      Confirmed 
                       US.Born          Duration.of.Residency 
                     Tentative                      Confirmed 
              Primary.Language               English.Speaking 
                     Confirmed                      Confirmed 
          English.Difficulties       Familiarity.with.America 
                     Confirmed                      Confirmed 
Familiarity.with.Ethnic.Origin            Identify.Ethnically 
                     Confirmed                       Rejected 
                     Belonging                 Discrimination 
                      Rejected                      Confirmed 
              Health.Insurance               Dental.Insurance 
                     Confirmed                      Confirmed 
Levels: Tentative Confirmed Rejected

Naive Bayes Classifier

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)


cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(825)
rf_fit <- train(Folkmedicine ~ ., data = rfdata, 
                 method = "naive_bayes", 
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
Warning: In density.default(x, na.rm = TRUE, ...) :
 extra argument 'verbose' will be disregarded
stopCluster(cl)
varImp(rf_fit)
ROC curve variable importance

                               Importance
Age                               100.000
Duration.of.Residency              61.198
Full.Time.Employment               49.606
Belonging                          40.543
Familiarity.with.America           39.330
Religion                           39.037
Primary.Language                   35.805
Gender                             30.127
Ethnicity                          29.581
Discrimination                     24.699
Identify.Ethnically                22.885
English.Speaking                   14.384
US.Born                            10.342
Familiarity.with.Ethnic.Origin      4.550
Dental.Insurance                    2.944
Health.Insurance                    2.618
English.Difficulties                0.779
Income_median                       0.000